#loading packages 
library(ezids)
#library(ggplot2)
library(ggrepel)
library(gridExtra)
#library(tibble)
library(tidyverse)
library(corrplot)
library(lattice)
library(psych)
library(FNN)
library(gmodels)
library(caret)

#loading data 
NYweath <- data.frame(read.csv("data/NYC_weather_1869_2022.csv"))

#converting to R date format and adding columns for day, month, and year
NYweath$DATE <- as.Date(NYweath$DATE)
NYweath$day <- format(NYweath$DATE, format="%d")
NYweath$month <- format(NYweath$DATE, format="%m")
NYweath$year <- format(NYweath$DATE, format="%Y")

#converting temperature observations to numerical
NYweath$TMAX <- as.numeric(NYweath$TMAX)
NYweath$TMIN <- as.numeric(NYweath$TMIN)
NYweath$TAVG <- as.numeric(NYweath$TAVG)
NYweath$year <- as.numeric(NYweath$year)

#Making month a factor
NYweath$month <- as.factor(NYweath$month)

# subset data to desired variables
NYweath_sub <- subset(NYweath, select = c(DATE, day, month, year, TMAX, TMIN, TAVG, PRCP, SNOW)) 

#creating a subset for 1900 on
NYweath_00 <- subset(NYweath_sub, year > 1899)
xkabledplyhead(NYweath_00)
Head
DATE day month year TMAX TMIN TAVG PRCP SNOW
11265 1900-01-01 01 01 1900 20 14 NA 0.03 0.5
11266 1900-01-02 02 01 1900 23 13 NA 0.00 0.0
11267 1900-01-03 03 01 1900 26 19 NA 0.00 0.0
11268 1900-01-04 04 01 1900 32 19 NA 0.00 0.0
11269 1900-01-05 05 01 1900 39 29 NA 0.00 0.0

Analysis Questions

  1. Logistic regression to predict rainy day based on TMAX, TMIN, MONTH, DAY?
  2. Linear correlations between Air Quality and Weather variables
  3. Can we predict the month based on weather variables (kNN)?

Logitic Model Preparation

First, we format our data to create a variable representing whether it rained on a given day, starting :

# add column based on condition of rain on a given day
NYweath.rain <- NYweath_00
NYweath.rain$rained <- ifelse(NYweath.rain$PRCP > 0.0, 1, 0)
NYweath.rain$rained <- as.factor(NYweath.rain$rained)
xkablesummary(NYweath.rain)
Table: Statistics summary.
DATE day month year TMAX TMIN TAVG PRCP SNOW rained
Min Min. :1900-01-01 Length:44829 01 : 3813 Min. :1900 Min. : 2.0 Min. :-15.0 Min. : 0 Min. :0.00 Min. : 0.0 0:29965
Q1 1st Qu.:1930-09-08 Class :character 03 : 3813 1st Qu.:1930 1st Qu.: 47.0 1st Qu.: 34.0 1st Qu.:43 1st Qu.:0.00 1st Qu.: 0.0 1:14864
Median Median :1961-05-15 Mode :character 05 : 3813 Median :1961 Median : 63.0 Median : 48.0 Median :57 Median :0.00 Median : 0.0 NA
Mean Mean :1961-05-15 NA 07 : 3813 Mean :1961 Mean : 62.1 Mean : 47.2 Mean :56 Mean :0.13 Mean : 0.1 NA
Q3 3rd Qu.:1992-01-20 NA 08 : 3813 3rd Qu.:1992 3rd Qu.: 78.0 3rd Qu.: 62.0 3rd Qu.:71 3rd Qu.:0.05 3rd Qu.: 0.0 NA
Max Max. :2022-09-26 NA 10 : 3782 Max. :2022 Max. :106.0 Max. : 87.0 Max. :92 Max. :7.57 Max. :27.3 NA
NA NA NA (Other):21982 NA NA NA NA’s :42181 NA NA’s :96 NA
xkabledplyhead(NYweath.rain)
Head
DATE day month year TMAX TMIN TAVG PRCP SNOW rained
11265 1900-01-01 01 01 1900 20 14 NA 0.03 0.5 1
11266 1900-01-02 02 01 1900 23 13 NA 0.00 0.0 0
11267 1900-01-03 03 01 1900 26 19 NA 0.00 0.0 0
11268 1900-01-04 04 01 1900 32 19 NA 0.00 0.0 0
11269 1900-01-05 05 01 1900 39 29 NA 0.00 0.0 0

First, let’s create a two-way contingency table To study the effects on rainy day by the factor month and make sure there are no cells of zero frequencies.

rainy_vs_month = xtabs(~ rained + month, data = NYweath.rain)
rainy_vs_month
##       month
## rained   01   02   03   04   05   06   07   08   09   10   11   12
##      0 2476 2297 2452 2324 2428 2393 2545 2597 2653 2770 2551 2479
##      1 1337 1177 1361 1366 1385 1297 1268 1216 1033 1012 1109 1303

We can then quickly run a chi-squared test to see if the two are independent (or same frequency distribution).

chisq.result = chisq.test(rainy_vs_month)
chisq.result
## 
##  Pearson's Chi-squared test
## 
## data:  rainy_vs_month
## X-squared = 200, df = 11, p-value <2e-16

Based on the result, the factor variables rained and month are dependent - there is a statistically significant relationship between the two. We can use the month variable in our logistic regression to predict the rained outcome for a day.

The Logistic Model

Let’s jump to the logistic regression to predict a precipitation event on a given day:

rainedLogit <- glm(rained ~ month + TMAX + TMIN, data = NYweath.rain, family = "binomial")
summary(rainedLogit)
## 
## Call:
## glm(formula = rained ~ month + TMAX + TMIN, family = "binomial", 
##     data = NYweath.rain)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.681  -0.901  -0.696   1.179   2.919  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -0.28924    0.05857   -4.94  7.9e-07 ***
## month02      0.03407    0.05171    0.66     0.51    
## month03      0.02084    0.05141    0.41     0.69    
## month04     -0.06767    0.05626   -1.20     0.23    
## month05     -0.34685    0.06373   -5.44  5.3e-08 ***
## month06     -0.77503    0.07242  -10.70  < 2e-16 ***
## month07     -1.10573    0.07803  -14.17  < 2e-16 ***
## month08     -1.18707    0.07681  -15.45  < 2e-16 ***
## month09     -1.18831    0.07163  -16.59  < 2e-16 ***
## month10     -0.97082    0.06243  -15.55  < 2e-16 ***
## month11     -0.66497    0.05509  -12.07  < 2e-16 ***
## month12     -0.25122    0.05069   -4.96  7.2e-07 ***
## TMAX        -0.10397    0.00210  -49.49  < 2e-16 ***
## TMIN         0.13821    0.00251   55.00  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 56958  on 44828  degrees of freedom
## Residual deviance: 53249  on 44815  degrees of freedom
## AIC: 53277
## 
## Number of Fisher Scoring iterations: 4
xkabledply(rainedLogit, title = paste("Logistic Regression :", format(formula(rainedLogit)) ))
Logistic Regression : rained ~ month + TMAX + TMIN
Estimate Std. Error z value Pr(>|z|)
(Intercept) -0.2892 0.0586 -4.938 0.000
month02 0.0341 0.0517 0.659 0.510
month03 0.0208 0.0514 0.405 0.685
month04 -0.0677 0.0563 -1.203 0.229
month05 -0.3469 0.0637 -5.442 0.000
month06 -0.7750 0.0724 -10.702 0.000
month07 -1.1057 0.0780 -14.170 0.000
month08 -1.1871 0.0768 -15.454 0.000
month09 -1.1883 0.0716 -16.590 0.000
month10 -0.9708 0.0624 -15.550 0.000
month11 -0.6650 0.0551 -12.071 0.000
month12 -0.2512 0.0507 -4.956 0.000
TMAX -0.1040 0.0021 -49.485 0.000
TMIN 0.1382 0.0025 55.005 0.000

All the coefficients are found significant (small p-values) except for the months of February, March, and April. TMAX has a negative effect on precipitation while TMIN has a positive effect. The months with significant coefficients all have a negative effect on the rain outcome for a given day.

Let’s obtain the growth/decay factors for each explanatory variables. The factors are the exponentials of the coefficients:

expcoeff = exp(coef(rainedLogit))
xkabledply(as.table(expcoeff), title = "Exponential of coefficients in rained Logit Reg")
Exponential of coefficients in rained Logit Reg
x
(Intercept) 0.749
month02 1.035
month03 1.021
month04 0.935
month05 0.707
month06 0.461
month07 0.331
month08 0.305
month09 0.305
month10 0.379
month11 0.514
month12 0.778
TMAX 0.901
TMIN 1.148

Compared to January, the months of February and March have a positive effect on the probability of a rainy day. Each gain in the minimum daily temperature value also has a positive effect on chances of rain.

Logistic Model Evaluation

Confusion matrix

library(ModelMetrics)
library(scales)

rainedLogit.confusion <- ModelMetrics::confusionMatrix(
  actual = rainedLogit$y,
  predicted = rainedLogit$fitted.values)

xkabledply(rainedLogit.confusion,
  title = "Confusion matrix for logit model")
Confusion matrix for logit model
1 2
28093 11255
1872 3609
accuracy <- (rainedLogit.confusion[4] + rainedLogit.confusion[1]) / nrow(NYweath.rain)
precision <- rainedLogit.confusion[4] / (rainedLogit.confusion[4] + rainedLogit.confusion[3])

The confusion matrix above was generated for the cutoff value of 0.5.

The accuracy of this logistic model is approx. 0.71 which shows that out of all predictions made by our model on whether it rained or not, 71% were correct.

The precision value signals that it actually only rained on 24% of the days out of all days the model predicted it would rain.

Mcfadden’s Test

rainedNullLogit <- glm(rained ~ 1, data = NYweath.rain, family = "binomial")
mcFadden = 1 - logLik(rainedLogit)/logLik(rainedNullLogit)
mcFadden
## 'log Lik.' 0.0651 (df=14)

The McFadden test outputs a pseudo-R-square value of 0.065 which demonstrates that only about 6.5% of the variation in rainy day outcomes is explained by our regressors. This is not a significant outcome for the likelihood that the model is correct at predicting rainy days.

Hosmer and Lemeshow test

The Hosmer and Lemeshow Goodness of Fit test can be used to evaluate logistic regression fit.

library(ResourceSelection)
# Hosmer and Lemeshow test, a chi-squared test
rainedLogitHoslem = hoslem.test(NYweath.rain$rained, fitted(rainedLogit))
rainedLogitHoslem
## 
##  Hosmer and Lemeshow goodness of fit (GOF) test
## 
## data:  NYweath.rain$rained, fitted(rainedLogit)
## X-squared = 44829, df = 8, p-value <2e-16

The p-value of 0 is very low which indicates that the model is a good fit.

ROC/AUC

library(pROC)
probabilities <- predict(rainedLogit, type = "response")
# add probabilities as column
NYweath.rain$probs <- probabilities

rainedLogitROC <- roc(rained~probs, data=NYweath.rain)
plot(rainedLogitROC)

The area-under-curve of the ROC plot is 0.68, which is less than the significance value of 0.8. The true positive rate of our model measure is about 68%.

Overall, I think we should attempt to find a different combination of predictors for this model.

NYC daily air quality data analysis

Let’s pull in our air quality data. It contains measurements of daily PM2.5 and air quality index values taken from various locations around Manhattan.

PM2.5 includes particles less than or equal to 2.5 micrometers and is also called fine particle pollution. The AQI is an index for reporting daily air quality. It tells how clean or polluted the air is, and what associated health effects might be a concern, especially for ground-level ozone and particle pollution.

Let’s load the new data and have a look at it’s structure:

DailyAQ_00_22 <- data.frame(read.csv("data/daily-AQ-NY-00-20.csv"))
DailyAQ_00_22 <- DailyAQ_00_22[c('Date', 'Daily.Mean.PM2.5.Concentration', 'DAILY_AQI_VALUE')]
colnames(DailyAQ_00_22) <- c('DATE', 'PM2.5', 'AQI')
str(DailyAQ_00_22)
## 'data.frame':    7097 obs. of  3 variables:
##  $ DATE : chr  "1/1/00" "1/2/00" "1/3/00" "1/4/00" ...
##  $ PM2.5: num  39.8 41.3 30.8 16.4 7.8 23.1 18.2 27 19.4 9.7 ...
##  $ AQI  : int  112 115 90 60 33 74 64 82 66 40 ...
xkablesummary(DailyAQ_00_22)
Table: Statistics summary.
DATE PM2.5 AQI
Min Length:7097 Min. :-1.2 Min. : 0
Q1 Class :character 1st Qu.: 5.9 1st Qu.: 25
Median Mode :character Median : 9.0 Median : 38
Mean NA Mean :10.9 Mean : 41
Q3 NA 3rd Qu.:13.6 3rd Qu.: 54
Max NA Max. :86.0 Max. :167
xkabledplyhead(DailyAQ_00_22)
Head
DATE PM2.5 AQI
1/1/00 39.8 112
1/2/00 41.3 115
1/3/00 30.8 90
1/4/00 16.4 60
1/5/00 7.8 33

We need to convert the date from a character string to an R type. We also calculate year-over-year growth rates for both daily PM2.5 and AQI and store them in a column.

We have about 7,000 observations between the years 2000 and 2022. A few plots to help us visualize the data:

# distribution plot of pmi2.5 and daily AQI
mean_pm25 <- mean(DailyAQ_00_22$PM2.5)
mean_aqi <- mean(DailyAQ_00_22$AQI)

# TODO: combine plots into one frame
ggplot(DailyAQ_00_22) +
  geom_histogram(aes(x=PM2.5), na.rm=TRUE, alpha=0.5, color="black", fill='#BD2AE2', bins=100, binwidth=2) +
  geom_vline(xintercept=mean_pm25, color="black", size=1, linetype=5, show.legend=FALSE) +
  annotate("text", x=mean_pm25 + 9, y=1000, label=paste(round(mean_pm25, 2)), angle=0, size=4, color="black") +
  labs(title="Distribution of Daily PM2.5 Measurements", x="ug/m3 LC", y="Count")

ggplot(DailyAQ_00_22) +
  geom_histogram(aes(x=AQI), na.rm=TRUE, alpha=0.5, color="black", fill='#2DD164', bins=50, binwidth=5) +
  geom_vline(xintercept=mean_aqi, color="black", size=1, linetype=5, show.legend=FALSE) +
  annotate("text", x=mean_aqi + 9, y=625, label=paste(round(mean_aqi, 2)), angle=0, size=4, color="black") +
  labs(title="Distribution of Daily AQI Level", x="", y="Count")

# TODO: group these in same figure, separate plots
ggplot(DailyAQ_00_22_Yearly_Growth, aes(group=1)) +
  geom_line(aes(x = year, y = pm2.5_diffRate), na.rm = T, stat = "identity", color="#290DDA", size=1) +
  geom_point(aes(x = year, y = pm2.5_diffRate), na.rm = TRUE, fill="#124CF2", shape = 23) +
  labs(title="PM2.5 particulate year-over-year rate in NYC", x="Year", y="ug/m3 LC") +
  theme(
    axis.title.y = element_text(color = "#043008", size = 13),
    axis.title.y.right = element_text(color = "#E6E930", size = 13)
  )

ggplot(DailyAQ_00_22_Yearly_Growth, aes(group=1)) +
  geom_line(aes(x = year, y = aqi_diffRate), na.rm = T, stat="identity", color="#043008", size=1) +
  geom_point(aes(x = year, y = aqi_diffRate), na.rm = TRUE, fill="#E6E930", shape = 23) +
  labs(title="AQI year-over-year rate in NYC", x="Year", y="ug/m3 LC") +
  theme(
    axis.title.y = element_text(color = "#043008", size = 13),
    axis.title.y.right = element_text(color = "#E6E930", size = 13)
  )

# ggplot(DailyAQ_00_22_Yearly_Growth, aes(group=1)) +
#   labs(title="AQI and growth rate and growht/decay rate by year in NYC", x="Year", y="ug/m3 LC") +
#   scale_y_continuous(sec.axis = sec_axis(~., name="Year-over-year Diff (%)")) +
#   theme(
#     axis.title.y = element_text(color = "#2DD164", size = 13),
#     axis.title.y.right = element_text(color = "#E6E930", size = 13)
#   )

Next, we combine our new dataset with the NYC weather data based on the date. The days without a matching air quality measurement will be dropped after merge.

# merge data frame by date
DailyAQ_merged <- merge(DailyAQ_00_22, NYweath_00, by="DATE")
# select required columns
DailyAQ_merged <- DailyAQ_merged[ , c('DATE', 'year.x', 'month', 'PM2.5', 'AQI', 'TMAX', 'TMIN', 'PRCP', 'SNOW')]
colnames(DailyAQ_merged)[2] <- "year"
str(DailyAQ_merged)
## 'data.frame':    7063 obs. of  9 variables:
##  $ DATE : Date, format: "2000-01-01" "2000-01-02" ...
##  $ year : num  2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 ...
##  $ month: Factor w/ 12 levels "01","02","03",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ PM2.5: num  39.8 41.3 30.8 16.4 7.8 23.1 18.2 27 19.4 9.7 ...
##  $ AQI  : int  112 115 90 60 33 74 64 82 66 40 ...
##  $ TMAX : num  50 60 64 60 47 49 38 51 58 52 ...
##  $ TMIN : num  34 43 51 46 29 35 30 37 44 40 ...
##  $ PRCP : num  0 0 0 0.7 0 0 0 0.02 0.84 0 ...
##  $ SNOW : num  0 0 0 0 0 0 0 0 0 0 ...
xkablesummary(DailyAQ_merged)
Table: Statistics summary.
DATE year month PM2.5 AQI TMAX TMIN PRCP SNOW
Min Min. :2000-01-01 Min. :2000 03 : 634 Min. :-1.2 Min. : 0.0 Min. : 13.0 Min. :-1.0 Min. :0.00 Min. : 0.00
Q1 1st Qu.:2007-02-06 1st Qu.:2007 01 : 617 1st Qu.: 5.9 1st Qu.: 25.0 1st Qu.: 48.0 1st Qu.:36.0 1st Qu.:0.00 1st Qu.: 0.00
Median Median :2012-03-16 Median :2012 05 : 615 Median : 9.1 Median : 38.0 Median : 64.0 Median :48.0 Median :0.00 Median : 0.00
Mean Mean :2011-12-20 Mean :2011 04 : 611 Mean :10.9 Mean : 41.1 Mean : 62.6 Mean :48.4 Mean :0.13 Mean : 0.09
Q3 3rd Qu.:2017-05-27 3rd Qu.:2017 12 : 610 3rd Qu.:13.6 3rd Qu.: 54.0 3rd Qu.: 79.0 3rd Qu.:63.0 3rd Qu.:0.05 3rd Qu.: 0.00
Max Max. :2022-09-26 Max. :2022 07 : 579 Max. :86.0 Max. :167.0 Max. :103.0 Max. :83.0 Max. :7.57 Max. :27.30
NA NA NA (Other):3397 NA NA NA NA NA NA

Linear Model with daily air quality and weather variables

# subset to numerical variables
DailyAQ_numerics <- DailyAQ_merged[ , c('PM2.5', 'AQI', 'TMAX', 'TMIN', 'PRCP', 'SNOW', 'year')]
# combine PRCP and SNOW into single value
#DailyAQ_numerics$PRCP <- DailyAQ_numerics$PRCP + DailyAQ_numerics$SNOW
#DailyAQ_numerics <- subset(DailyAQ_numerics, select = -c(SNOW))
DailyAQ_numerics$year <- DailyAQ_numerics$year - 2000 

Correlation analysis

Lattice pairs plot

pairs(DailyAQ_numerics)

pairs.panels(DailyAQ_numerics, 
  method = "pearson", # correlation method
  hist.col = "red", # set histogram color
  density = TRUE,  # show density plots
  ellipses = TRUE, # show correlation ellipses
  smoother = TRUE,
  lm = TRUE,
  main = "Pairs Plot Of Weather and Air Quality Numerical Variables",
  cex.labels=0.75
)

Another way to look at correlation using the corrplot function:

DailyAQ_cor <- cor(DailyAQ_numerics)
corrplot(DailyAQ_cor, method="number", title="Correlation Plot Of Weather and Air Quality Numerical Variables", mar = c(2, 2, 2, 2))

From the pearson correlation plot above, we can see a significantly large, positive correlation between PM2.5 concentrations and the daily AQI values. This is expected as PM2.5 are heavily weighed in calculations of AQI. Unfortunately, the correlation significance among our weather and air quality variables is relatively weak. However, we will still attempt a linear model between them below.

# yearly average and year-over year growth of daily AQI and PM2.5
ggplot(DailyAQ_00_22_Yearly_Growth) +
  geom_line(aes(x = year, y = aqi_avg), stat="identity", color="#2DD164", size=1) +
  geom_point(aes(x = year, y = aqi_avg), na.rm = TRUE, fill="#457108", shape = 21) +
  labs(title="Average AQI by year in NYC", x="Year", y="AQI value")

ggplot(DailyAQ_00_22_Yearly_Growth) +
  geom_line(aes(x = year, y = pm2.5_avg), stat="identity", color="#BD2AE2", size=1) +
  geom_point(aes(x = year, y = pm2.5_avg), na.rm = TRUE, fill="#124CF2", shape = 21) +
  labs(title="Average PM2.5 particulate amount by year in NYC", x="Year", y="Year-over-year Diff (%)")

Linear models

Let’s start by creating a linear model to describe the relationship between AQI and year.

aqi_fit <- lm(AQI ~ year, data = DailyAQ_numerics)
summary(aqi_fit)
## 
## Call:
## lm(formula = AQI ~ year, data = DailyAQ_numerics)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -50.34 -13.11  -1.49  11.06 126.83 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  61.4369     0.4497   136.6   <2e-16 ***
## year         -1.7748     0.0342   -51.9   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 18.4 on 7061 degrees of freedom
## Multiple R-squared:  0.276,  Adjusted R-squared:  0.276 
## F-statistic: 2.69e+03 on 1 and 7061 DF,  p-value: <2e-16
xkabledply(aqi_fit, title = paste("First Linear Model: ", format( formula(aqi_fit) )))
First Linear Model: AQI ~ year
Estimate Std. Error t value Pr(>|t|)
(Intercept) 61.44 0.4497 136.6 0
year -1.77 0.0342 -51.8 0

The coefficient for the date regressor is significant, and has a negative effect on daily AQI by a very small factor of .005. Although the p-value of the F-statistic is significant, date still only explains 28% of the variability in daily AQI measurements.

ggplot(DailyAQ_00_22, aes(x = year, y = AQI)) + 
  geom_point(alpha = 0.5, color = "#2DD164", position = "jitter") +
  labs(x = "Year", y = "AQI Value", title = "Daily AQI Values From 2000-2022 With Trend Line") +
  geom_smooth(method = 'lm', formula = 'y ~ x', color = "black", fill="black")

The plot displays a slightly downward treng in daily AQI, but there is a lot of noise distorting the fit.

Adding month as a categorical regressor

In our first analysis, we analyzed linear trends of TMAX over time and determined a slight positive correlation observed over the years 1900-2022. Based on that fit, we hypothesized that seasonality trends had an impact on model performance.

We believe seasonality also effects daily AQI measurements.

# NYC weather - Avg TMAX by month
NYweath_Monthly_Avg <- NYweath_00 %>%
  group_by(month) %>%
  summarize(avg_max_temp = mean(TMAX, na.rm=T),
            avg_min_temp = mean(TMIN, na.rm=T))


ggplot(NYweath_Monthly_Avg, aes(x = as.numeric(month), y = avg_max_temp)) +
  geom_line(color="#F21E1E", size = 2) +
  geom_point(na.rm = TRUE, fill="#126BF4", shape = 21, size = 4) +
  labs(title="Average TMAX By Month in NYC", x="Month", y="Temperature (°F)") +
  scale_x_continuous(name = "Month",
                     breaks = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12))

DailyAQ_monthly <- DailyAQ_merged %>%
  group_by(month) %>%
  summarize(pm2.5_avg = mean(PM2.5, na.rm=T),
            aqi_avg = mean(AQI, na.rm=T))

# calculate growth/decay rates month-over-month
DailyAQ_monthly <- DailyAQ_monthly %>%
  mutate(pm2.5_diffRate = ((pm2.5_avg - lag(pm2.5_avg)) / pm2.5_avg) * 100,
         aqi_diffRate = ((aqi_avg - lag(aqi_avg)) / aqi_avg) * 100
      )
# populate January rates based on December
DailyAQ_monthly[1, 4]$pm2.5_diffRate <- ((DailyAQ_monthly$pm2.5_avg[1] - DailyAQ_monthly$pm2.5_avg[12]) /  DailyAQ_monthly$pm2.5_avg[1]) * 100
DailyAQ_monthly[1, 5]$aqi_diffRate <- ((DailyAQ_monthly$aqi_avg[1] - DailyAQ_monthly$aqi_avg[12]) /  DailyAQ_monthly$aqi_avg[1]) * 100

# yearly average and year-over year growth of daily AQI and PM2.5
# TODO: combine with month-over-month change plot
ggplot(DailyAQ_monthly, aes(x = as.numeric(month), y = aqi_avg)) +
  geom_line(color="#47ABE9", size = 2) +
  geom_point(na.rm = TRUE, fill="#C10808", shape = 21, size = 4) +
  labs(title="Average AQI By Month in NYC", x="Month", y="AQI") +
  scale_x_continuous(name = "Month",
                     breaks = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12))

ggplot(DailyAQ_monthly, aes(x = as.numeric(month), y = aqi_diffRate)) +
  geom_line(na.rm = TRUE, stat="identity", color="#043008", size=2) +
  geom_point(na.rm = TRUE, fill="#E6E930", shape = 21) +
  labs(title="Average AQI month-over-month change rate", x="Month", y="AQI") +
  scale_x_continuous(name = "Month",
                     breaks = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12))

Let’s modify our last model to attempt to fit seasonality by adding month as a categorical regressor and our variable-of-interest from the last project - TMAX - to predict daily AQI.

aqi_fit2 <- lm(AQI ~ TMAX + month, data = DailyAQ_merged)
summary(aqi_fit2)
## 
## Call:
## lm(formula = AQI ~ TMAX + month, data = DailyAQ_merged)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -43.72 -14.66  -3.03  11.52 124.66 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  24.9722     1.3444   18.58  < 2e-16 ***
## TMAX          0.6781     0.0271   25.03  < 2e-16 ***
## month02      -5.9768     1.1604   -5.15  2.7e-07 ***
## month03     -19.3975     1.1672  -16.62  < 2e-16 ***
## month04     -33.0288     1.2871  -25.66  < 2e-16 ***
## month05     -37.9619     1.4304  -26.54  < 2e-16 ***
## month06     -38.8086     1.5925  -24.37  < 2e-16 ***
## month07     -37.7940     1.6857  -22.42  < 2e-16 ***
## month08     -40.4565     1.6563  -24.43  < 2e-16 ***
## month09     -43.7962     1.5422  -28.40  < 2e-16 ***
## month10     -34.8408     1.3458  -25.89  < 2e-16 ***
## month11     -19.9973     1.2209  -16.38  < 2e-16 ***
## month12      -7.9325     1.1470   -6.92  5.1e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 20 on 7050 degrees of freedom
## Multiple R-squared:  0.149,  Adjusted R-squared:  0.147 
## F-statistic:  103 on 12 and 7050 DF,  p-value: <2e-16
xkabledply(aqi_fit2, title = paste("Second Linear Model: ", format( formula(aqi_fit2) )))
Second Linear Model: AQI ~ TMAX + month
Estimate Std. Error t value Pr(>|t|)
(Intercept) 24.972 1.3444 18.58 0
TMAX 0.678 0.0271 25.03 0
month02 -5.977 1.1604 -5.15 0
month03 -19.398 1.1672 -16.62 0
month04 -33.029 1.2871 -25.66 0
month05 -37.962 1.4304 -26.54 0
month06 -38.809 1.5925 -24.37 0
month07 -37.794 1.6857 -22.42 0
month08 -40.456 1.6563 -24.43 0
month09 -43.796 1.5422 -28.40 0
month10 -34.841 1.3458 -25.89 0
month11 -19.997 1.2209 -16.38 0
month12 -7.933 1.1470 -6.92 0

The regression coefficient for TMAX is significant and positively correlated, with each degree increase resulting in AQI increasing by a factor of 0.68. All months, when compared to January, have a negative impact on AQI, with September having the largest difference. The p-value of the model’s F-statistic is also significant, allowing us to reject the null hypothesis and conclude that there’s a significant relationship between our chosen predictors and the daily AQI value. However, the \(R^2\) for our model is only .149, which indicates that only 14.7% of the variation in daily AQI can be explained by TMAX and month.

Seasonality can cause a poor linear model. Properly testing it would require developing a seasonality time-series model to properly fit the data.

Check for multicollinearity

# model VIF scores
xkablevif(aqi_fit2, title = "Model 2 VIF")
Model 2 VIF
month02 month03 month04 month05 month06 month07 month08 month09 month10 month11 month12 TMAX
1.78 1.98 2.32 2.88 3.36 3.79 3.59 3.01 2.38 1.96 1.84 4.25

The VIF values of all regressors are acceptable.

k-NN model to predict month based on weather and air quality data

A k-NN model can help us further analyze the seasonality effect by attempting to predict the month based on AQI and TMAX variables.

Evaluate relationships via scatter plots - scale and center data
- scatter plots of AQI,TMAX
- check composition of labels (months)

Plot variables

ggplot(DailyAQ_merged) +
    geom_point(aes(x=AQI, y=TMAX, color=month), alpha = 0.7) +
    labs(title = "Daily Maximum Temperature vs Daily Air Quality Index Value Distinguished By Month",
         x = "Daily AQI Value",
         y = "Daily Maximum Temperature (F)") +
  scale_color_brewer(palette = "Paired")

Center and scale our data

DailyAQ_scaled <- as.data.frame(scale(DailyAQ_merged[4:9], center = TRUE, scale = TRUE))
str(DailyAQ_scaled)
## 'data.frame':    7063 obs. of  6 variables:
##  $ PM2.5: num  3.839 4.039 2.642 0.727 -0.417 ...
##  $ AQI  : num  3.282 3.421 2.264 0.876 -0.373 ...
##  $ TMAX : num  -0.6996 -0.1462 0.0752 -0.1462 -0.8656 ...
##  $ TMIN : num  -0.872 -0.328 0.155 -0.147 -1.173 ...
##  $ PRCP : num  -0.356 -0.356 -0.356 1.502 -0.356 ...
##  $ SNOW : num  -0.113 -0.113 -0.113 -0.113 -0.113 ...

Create train and test data sets with 4:1 splits, as well as label sets.

set.seed(1000)
DailyAQ_sample <- sample(2, nrow(DailyAQ_scaled), replace=TRUE, prob=c(0.80, 0.20))

DailyAQ_training <- DailyAQ_scaled[DailyAQ_sample == 1, ]
DailyAQ_test <- DailyAQ_scaled[DailyAQ_sample == 2, ]

DailyAQ_trainLabels <- DailyAQ_merged[DailyAQ_sample == 1, 3]
DailyAQ_testLabels <- DailyAQ_merged[DailyAQ_sample == 2, 3]

nrow(DailyAQ_training)
## [1] 5664
nrow(DailyAQ_test)
## [1] 1399

Build kNN model

# set kval
kval <- 3

# build model
DailyAQ_pred <- knn(train = DailyAQ_training,
                    test = DailyAQ_test,
                    cl = DailyAQ_trainLabels,
                    k = kval)

# confusion matrix
DailyAQ_confusionMatrix <- caret::confusionMatrix(DailyAQ_pred, reference = DailyAQ_testLabels)
DailyAQ_pred_accuracy <- DailyAQ_confusionMatrix$overall['Accuracy']


xkabledply(as.matrix(DailyAQ_confusionMatrix), title = paste("ConfusionMatrix for k = ", kval))
ConfusionMatrix for k = 3
01 02 03 04 05 06 07 08 09 10 11 12
01 60 38 16 7 1 0 0 0 0 5 13 50
02 24 33 38 10 0 0 0 0 0 4 18 18
03 14 13 37 26 7 1 0 0 2 11 27 11
04 2 3 20 52 22 6 2 2 3 36 25 6
05 0 1 2 20 38 19 10 8 28 27 7 0
06 0 0 0 1 8 39 34 25 25 7 1 0
07 0 0 0 0 4 15 41 34 13 3 0 0
08 0 0 0 2 2 15 23 24 13 3 0 0
09 0 0 1 1 9 14 9 10 17 4 1 0
10 0 3 2 6 8 5 1 0 3 24 8 1
11 2 1 9 6 4 1 0 0 0 6 15 6
12 16 13 9 3 0 0 0 0 0 2 5 19
xkabledply(data.frame(DailyAQ_confusionMatrix$byClass), title=paste("k = ", kval))
k = 3
Sensitivity Specificity Pos.Pred.Value Neg.Pred.Value Precision Recall F1 Prevalence Detection.Rate Detection.Prevalence Balanced.Accuracy
Class: 01 0.508 0.898 0.316 0.952 0.316 0.508 0.390 0.0843 0.0429 0.1358 0.704
Class: 02 0.314 0.913 0.228 0.943 0.228 0.314 0.264 0.0751 0.0236 0.1036 0.614
Class: 03 0.276 0.911 0.248 0.922 0.248 0.276 0.262 0.0958 0.0264 0.1065 0.594
Class: 04 0.388 0.900 0.290 0.933 0.290 0.388 0.332 0.0958 0.0372 0.1279 0.644
Class: 05 0.369 0.906 0.237 0.948 0.237 0.369 0.289 0.0736 0.0272 0.1144 0.637
Class: 06 0.339 0.921 0.279 0.940 0.279 0.339 0.306 0.0822 0.0279 0.1001 0.630
Class: 07 0.342 0.946 0.373 0.939 0.373 0.342 0.356 0.0858 0.0293 0.0786 0.644
Class: 08 0.233 0.955 0.293 0.940 0.293 0.233 0.260 0.0736 0.0172 0.0586 0.594
Class: 09 0.164 0.962 0.258 0.935 0.258 0.164 0.200 0.0743 0.0122 0.0472 0.563
Class: 10 0.182 0.971 0.393 0.919 0.393 0.182 0.249 0.0944 0.0172 0.0436 0.576
Class: 11 0.125 0.973 0.300 0.922 0.300 0.125 0.176 0.0858 0.0107 0.0357 0.549
Class: 12 0.171 0.963 0.284 0.931 0.284 0.171 0.213 0.0793 0.0136 0.0479 0.567

How does k affect classification accuracy?

evaluateK = function(k, train_set, val_set, train_class, val_class){
  
  # Build knn with k neighbors considered.
  set.seed(1000)
  class_knn = knn(train = train_set,    #<- training set cases
                  test = val_set,       #<- test set cases
                  cl = train_class,     #<- category for classification
                  k = k)                #<- number of neighbors considered
  
  tab = table(class_knn, val_class)
  
  # Calculate the accuracy.
  accu = sum(tab[row(tab) == col(tab)]) / sum(tab)                         
  cbind(k = k, accuracy = accu)
}

# call evaluateK function for each odd k-value between 1 to 21
knn_different_k = sapply(seq(1, 21, by = 2),
                         function(x) evaluateK(x, 
                                             train_set = DailyAQ_training,
                                             val_set = DailyAQ_test,
                                             train_class = DailyAQ_trainLabels,
                                             val_class = DailyAQ_testLabels))

# Reformat the results
knn_different_k = data.frame(k = knn_different_k[1,],
                             accuracy = knn_different_k[2,])
ggplot(knn_different_k, aes(x = k, y = accuracy)) +
  geom_line(color = "orange", size = 1.5) +
  geom_point(size = 3) + 
  labs(title = "kNN Model Accuracy vs k-value",
       x = "Model k-value",
       y = "Accuracy")

It seems 13-nearest neighbors is a decent choice because that’s the greatest improvement in predictive accuracy before the incremental improvement trails off. With an accuracy of 0.32, our model predicting month based on TMAX and AQI is not a strong fit.